This notebook builds a Bag of Words embedding of the data set.
library(magrittr)
package ‘magrittr’ was built under R version 4.0.5
library(tidyverse)
package ‘tidyverse’ was built under R version 4.0.5
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
-- Attaching packages --------------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.1 --
v ggplot2 3.3.5 v purrr 0.3.4
v tibble 3.1.3 v dplyr 1.0.7
v tidyr 1.1.3 v stringr 1.4.0
v readr 2.0.1 v forcats 0.5.1
package ‘ggplot2’ was built under R version 4.0.5
package ‘tibble’ was built under R version 4.0.5
package ‘tidyr’ was built under R version 4.0.5
package ‘purrr’ was built under R version 4.0.5
package ‘dplyr’ was built under R version 4.0.5
package ‘stringr’ was built under R version 4.0.5
package ‘forcats’ was built under R version 4.0.5
-- Conflicts ------------------------------------------------------------------------------------------------------------------------------ tidyverse_conflicts() --
x tidyr::extract() masks magrittr::extract()
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
x purrr::set_names() masks magrittr::set_names()
library(caret)
package ‘caret’ was built under R version 4.0.5
Loading required package: lattice
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Attaching package: ‘caret’
The following object is masked from ‘package:purrr’:
lift
library(tictoc)
package ‘tictoc’ was built under R version 4.0.5
source("../bow_creation.R")
package ‘tm’ was built under R version 4.0.5
Loading required package: NLP
Attaching package: ‘NLP’
The following object is masked from ‘package:ggplot2’:
annotate
source("./parameters.R")
# Rich error reporting: on an uncaught error, print a compact traceback to
# stderr and, when the session is not interactive, exit with status 1.
options(error = function() {
  # Redirect regular output to stderr while the handler runs; the redirection
  # is undone when the handler exits.
  sink(stderr())
  on.exit(sink(NULL))
  # Print the current call stack, skipping 3 calls, one line per call at most.
  traceback(3, max.lines = 1L)
  # Interactive sessions stay alive so the user can inspect the state.
  if (interactive()) {
    return(invisible(NULL))
  }
  quit(status = 1)
})
These are the main parameters used for the generation of the Bag of Words.
# Sampling of the data set (to reduce computing times during the exercise):
# either the tag "balanced" or a number of lines to draw at random.
lines_sampled <- "balanced"

# A word must occur at least this many times in the corpus to enter the
# bag of words.
min_word_occurence <- 100

# An nGram must occur at least this many times in the corpus to enter the
# bag of words.
min_ngram_occurence <- 1000

# Size of the nGrams to use (2 = bigrams, 3 = trigrams, ...).
nGrams <- 2

# Tag of the weighting function used for the Bag of Words.
# Possible values: "bin", "tf", "tfidf"
weighting <- "tfidf"

# Language of the sentences ("en", "fr"...)
language <- "en"

# Cutoff for the ratio of correlation. Features over-correlated are deleted.
cutoff <- 1

# Shall we cheat and remove all lines full of zeros in the Bag of Words?
remove_zeros <- TRUE
# Initialize a time counter for the whole run
tic("Time to run all")

# Open the CSV; `col_types` is expected to be defined before this chunk runs
df <- read_csv("cleaned.csv", col_types = col_types)

# For the purpose of speeding the experimentation we will work on a sample of
# the data frame only. The seed is fixed so the sample is reproducible.
set.seed(42)
if (identical(lines_sampled, "balanced")) {
  # Split the data set between toxic and non-toxic.
  # which() drops rows whose label is NA; a bare logical index would turn
  # them into rows filled with NA.
  df_toxic <- df[which(df$toxic == 1), ]
  df_ok <- df[which(df$toxic == 0), ]
  # Since non toxic is around 10 times bigger than toxic, sample it down to the
  # same size. The size is capped so sample() cannot fail if the non-toxic
  # set happens to be the smaller one.
  n_ok <- min(nrow(df_ok), nrow(df_toxic))
  df_ok_sampled <- df_ok[sample(nrow(df_ok), n_ok), ]
  # Merge back the two data frames per row
  df <- bind_rows(df_ok_sampled, df_toxic)
} else if (!is.numeric(lines_sampled) || length(lines_sampled) != 1L) {
  # Any other string would be compared lexically with 0 and then handed to
  # sample(); fail with an explicit message instead.
  stop(
    "`lines_sampled` must be \"balanced\" or a single number",
    call. = FALSE
  )
} else if (lines_sampled > 0) {
  # Draw at most as many rows as the data set holds
  df <- df[sample(nrow(df), min(lines_sampled, nrow(df))), ]
}
# A value of 0 (or less) keeps the full data set.
df
Select the weighting function that corresponds to the `weighting` tag.
# Possible weighting functions, keyed by the tag accepted in `weighting`.
# Built as a named list literal: pre-allocating with vector("list", 3) and
# then assigning by name appends, which left three stray NULL entries.
weighting_functions <- list(
  bin = weightBin,
  tf = weightTf,
  tfidf = weightTfIdf
)

# Fail early on an unknown tag instead of passing a NULL entry downstream.
if (!is.character(weighting) ||
    length(weighting) != 1L ||
    !weighting %in% names(weighting_functions)) {
  stop(
    "`weighting` must be one of: ",
    paste(names(weighting_functions), collapse = ", "),
    call. = FALSE
  )
}

# NOTE(review): single-bracket indexing yields a one-element named list, not
# the function itself. It is kept because bag_of_words() may rely on that
# shape; if it expects a plain function, switch to `[[weighting]]` -- confirm
# against bow_creation.R.
weighting_function <- weighting_functions[weighting]
# weighting_function
Run the function to get a bag of words
# Build the bag of words from the "comment_text" column, timing the call
# under the "Bag of words creation" label.
tic("Bag of words creation")
bow <- bag_of_words(
  df,
  "comment_text",
  min_word_occurence,
  min_ngram_occurence,
  nGrams,
  weighting_function,
  language
)
[1] "VCorpus:"
<<VCorpus>>
Metadata: corpus specific: 0, document level (indexed): 0
Content: documents: 40950
[1] "Dimensions of the words matrice:"
[1] 40950 1318
[1] "Dimensions of the nGrams matrice:"
[1] 40950 90
[1] "Dimensions of the features matrice:"
[1] 40950 1408
# Stop the most recently started timer and report the elapsed time
toc()
Bag of words creation: 138.19 sec elapsed
# Display the object returned by bag_of_words()
bow